In [ ]:
from __future__ import division
import codecs
import pickle
import networkx as nx
from collections import Counter
from scipy.stats import kurtosis, skew, variation

# Notebook-wide matplotlib defaults: figure size and font family applied to
# every figure created after this cell runs.
rcParams.update({
    'figure.figsize': (12.0, 10.0),
    'font.family': 'Times New Roman',
})

In [ ]:
from os.path import abspath, dirname

# The workspace is the parent of the directory this notebook is run from.
# dirname() is used instead of splitting the path on a literal '/', so the
# result is also correct on platforms whose path separator is not '/'.
workspace = dirname(abspath('.'))

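Note: Make sure that `workspace` (the parent of the current working directory) is the root directory of openie_eval; the data files used below are read from `workspace/data/...`.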


In [ ]:
# Project-local modules of openie_eval. Neither `sp` nor `ontologization` is
# referenced by the cells visible in this part of the notebook.
from openie_eval.openie_eval import semantic_parsing as sp
from openie_eval.openie_eval import ontologization
# Re-import both modules so that edits made to their source after the first
# import are picked up without restarting the kernel (`reload` is the
# Python 2 builtin).
reload(sp)
reload(ontologization)

from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; later cells call
# lemmatizer.lemmatize(<relation phrase>, pos='v') on relation phrases.
lemmatizer = WordNetLemmatizer()

In [ ]:
# --- Run configuration -------------------------------------------------------
# Data set to analyse; used as a directory name in the input and output paths.
# Alternative value: 'carnatic_music'
keyword = "hindustani_music"

# Suffix appended to the 'relations' pickle file name when loading data.
# Alternative value: '' (presumably the extractions without coreference
# resolution -- TODO confirm)
coref_suffix = "-coref"

# Not referenced by the cells visible in this part of the notebook.
# Alternative value: '-filtered'
filtered_suffix = ""

In [ ]:
# One row per extraction system: (data sub-directory, legend label, line colour).
# The rows are transposed into the three parallel lists used by the plotting
# cells, so index k always refers to the same system in all of them.
methods, labels, colors = (
    list(column)
    for column in zip(
        ('reverb', 'ReVerb', '#990033'),
        ('openie', 'OpenIE 4.0', '#006600'),
        ('semantic-parsing', 'Sem. Parsing', '#330066'),
    )
)

In [ ]:
# Quick visual check of the palette: one straight line per colour, with
# slopes 1, 2, 3, ... so the lines do not overlap.
x = arange(0, 100, 1)
for slope, line_color in enumerate(colors, 1):
    plot(x, slope*x, color=line_color)

Extractions per sentence


In [ ]:
def get_sentence_relations(relations):
    """Group extractions by the sentence they were extracted from.

    Parameters
    ----------
    relations : iterable of dict
        Every item must provide the keys 'full_sentence', 'arg1', 'rel' and
        'arg2'. The last three are concatenated with `+`, so they must be
        strings (or otherwise support concatenation).

    Returns
    -------
    dict
        Maps each 'full_sentence' value to the list of extractions found for
        it, in input order. Each extraction is arg1 + rel + arg2 joined
        without any separator; duplicates are kept.

    Raises
    ------
    KeyError
        If an item lacks one of the four keys.
    """
    sentence_relations = {}
    for rel in relations:
        extraction = rel['arg1'] + rel['rel'] + rel['arg2']
        # setdefault needs a single hash lookup; testing membership against
        # dict.keys() would build and scan a list on every iteration in
        # Python 2.
        sentence_relations.setdefault(rel['full_sentence'], []).append(extraction)
    return sentence_relations

In [ ]:
# For every extraction system, plot (on a log10 scale) how many sentences
# yielded at least k extractions, for k = 1..29.
figure()

# Thresholds k on the x-axis.
x = arange(1, 30, 1)

for method, label, color in zip(methods, labels, colors):
    input_file = workspace+'/data/'+method+'/' + keyword + '/relations'+coref_suffix+'.pickle'
    # Open in binary mode and close the handle deterministically.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(input_file, 'rb') as pickle_file:
        relations = pickle.load(pickle_file)

    sentence_relations = get_sentence_relations(relations)
    # An array (not a plain list) so that `n_extractions >= i` is an explicit
    # element-wise comparison, as in the other plotting cells.
    n_extractions = array([len(v) for v in sentence_relations.values()])

    # y[k] = number of sentences with at least x[k] extractions.
    y = [sum(n_extractions >= i) for i in x]
    log_y = log10(y)
    # log10(0) is -inf; plot empty bins at 0. Note that this makes a count
    # of 0 indistinguishable from a count of 1 on the figure.
    log_y[isinf(log_y)] = 0

    plot(x, log_y, '-', color=color, marker='.', label=label, linewidth=2.5)

    _mean = mean(n_extractions)
    # scipy.stats.variation is the coefficient of variation (std / mean),
    # not the variance.
    _cv = variation(n_extractions)

    # Formatted with '%s' so the output is the same under the Python 2 print
    # statement and the Python 3 print function.
    print('%s %s %s' % (method, _mean, _cv))
legend()
grid()

In [ ]:
# Axis labels, ranges, legend and tick sizes for the extractions-per-sentence
# figure. Axis labels are drawn slightly larger than legend and tick text.
fontsize = 32
axis_label_size = fontsize + 2

xlabel('Min. no. of extractions', fontsize=axis_label_size)
ylabel('Log. count of sentences', fontsize=axis_label_size)
xlim(1, 15)
ylim(0.69, 3.95)
# 'upper right' is the named form of legend location code 1.
legend(loc='upper right', prop={'size': fontsize})

for set_ticks in (xticks, yticks):
    set_ticks(fontsize=fontsize)

In [ ]:
# Write the current figure as both PDF and PNG with identical settings.
# NOTE(review): 'extrations' (sic) is kept as-is because it is part of the
# output file name.
fname = workspace+'/data/results/quantitative/'+keyword+'/extrations-per-sentence'
export_options = dict(dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
                      papertype=None, format=None, transparent=False,
                      bbox_inches='tight', pad_inches=0.1)
for extension in ('.pdf', '.png'):
    savefig(fname+extension, **export_options)

Extractions per argument


In [ ]:
from collections import Counter

In [ ]:
def get_uniq_relations(relations):
    """Return the distinct (arg1, rel, arg2) triples in first-seen order.

    Parameters
    ----------
    relations : iterable of dict
        Every item must provide the keys 'arg1', 'rel' and 'arg2' with
        hashable values (strings).

    Returns
    -------
    list of list
        One [arg1, rel, arg2] list per distinct triple, ordered by first
        occurrence in `relations`.

    Raises
    ------
    KeyError
        If an item lacks one of the three keys.
    """
    uniq_relations = []
    # A set gives constant-time membership tests. Keying on the tuple rather
    # than on a space-joined string keeps triples such as
    # ('a b', 'c', 'd') and ('a', 'b c', 'd') distinct.
    seen = set()
    for rel in relations:
        triple = (rel['arg1'], rel['rel'], rel['arg2'])
        if triple not in seen:
            seen.add(triple)
            uniq_relations.append(list(triple))
    return uniq_relations

In [ ]:
# For every extraction system, plot (on a log10 scale) how many distinct
# first arguments (arg1) occur in at least k extractions. Solid lines count
# all extractions, dashed lines count each distinct triple only once.
fig = figure()
ax = fig.add_subplot(1,1,1)

# Thresholds k on the x-axis. The first threshold is 1 rather than 0 for both
# line styles: every counted argument has at least one extraction, so the
# count is the same, and the first point lands inside the x-range that is
# later set for this figure.
x = concatenate((arange(0, 100, 20), arange(100, 1501, 50)))
x[0] = 1

for method, label, color in zip(methods, labels, colors):
    input_file = workspace+'/data/'+method+'/' + keyword + '/relations'+coref_suffix+'.pickle'
    # Open in binary mode and close the handle deterministically.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(input_file, 'rb') as pickle_file:
        relations = pickle.load(pickle_file)

    # (line style, extra plot kwargs, arg1 of every extraction considered).
    # Only the solid line carries a label, so each system appears once in
    # the legend.
    variants = [
        ('-', {'label': label}, [rel['arg1'] for rel in relations]),
        ('--', {}, [rel[0] for rel in get_uniq_relations(relations)]),
    ]
    for linestyle, extra_kwargs, arguments in variants:
        # Number of extractions per distinct argument.
        n_extractions = array(list(Counter(arguments).values()))

        # y[k] = number of arguments with at least x[k] extractions.
        y = [sum(n_extractions >= i) for i in x]
        log_y = log10(y)
        # log10(0) is -inf; plot empty bins at 0. Note that this makes a
        # count of 0 indistinguishable from a count of 1 on the figure.
        log_y[isinf(log_y)] = 0

        plot(x, log_y, linestyle, color=color, marker='.', linewidth=2.5, **extra_kwargs)

        _mean = mean(n_extractions)
        # scipy.stats.variation is the coefficient of variation (std / mean),
        # not the variance.
        _cv = variation(n_extractions)

        # Formatted with '%s' so the output is the same under the Python 2
        # print statement and the Python 3 print function.
        print('%s %s %s' % (method, _mean, _cv))
        print(y)

grid()

In [ ]:
# Axis labels, x-range, legend and tick sizes for the
# extractions-per-argument figure.
fontsize = 32
axis_label_size = fontsize + 2

xlim(1, 350)
xlabel('Min. no. of extractions', fontsize=axis_label_size)
ylabel('Log. count of entities', fontsize=axis_label_size)

# Legend entries that were created by the labelled plot() calls.
handles, _labels = ax.get_legend_handles_labels()

# Two black proxy lines that explain the line styles (solid / dashed); they
# are only used as legend handles and are not added to the axes.
custom_artists = [plt.Line2D((0,1),(0,0), color='k', linestyle=line_style)
                  for line_style in ('-', '--')]

ax.legend(handles+custom_artists,
          _labels + ['All', 'Unique'],
          loc='upper right', prop={'size': fontsize})

for set_ticks in (xticks, yticks):
    set_ticks(fontsize=fontsize)

In [ ]:
# Write the current figure as both PDF and PNG with identical settings.
# NOTE(review): 'extrations' (sic) is kept as-is because it is part of the
# output file name.
fname = workspace+'/data/results/quantitative/'+keyword+'/extrations-per-argument'

export_options = dict(dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
                      papertype=None, format=None, transparent=False,
                      bbox_inches='tight', pad_inches=0.1)
for extension in ('.pdf', '.png'):
    savefig(fname+extension, **export_options)

Extractions per relation-type


In [ ]:
# For every extraction system, plot (on a log10 scale) how many distinct
# relation types (verb-lemmatized relation phrases) occur in at least k
# extractions. Solid lines count all extractions, dashed lines count each
# distinct triple only once.
fig = figure()
ax = fig.add_subplot(1,1,1)

# Thresholds k on the x-axis.
x = concatenate((arange(0, 100, 20), arange(100, 1201, 50)))

for method, label, color in zip(methods, labels, colors):
    input_file = workspace+'/data/'+method+'/' + keyword + '/relations'+coref_suffix+'.pickle'
    # Open in binary mode and close the handle deterministically.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(input_file, 'rb') as pickle_file:
        relations = pickle.load(pickle_file)

    # (line style, extra plot kwargs, relation phrase of every extraction
    # considered). Only the solid line carries a label, so each system
    # appears once in the legend.
    variants = [
        ('-', {'label': label}, [rel['rel'] for rel in relations]),
        ('--', {}, [rel[1] for rel in get_uniq_relations(relations)]),
    ]
    for linestyle, extra_kwargs, phrases in variants:
        relation_types = [lemmatizer.lemmatize(phrase, pos='v') for phrase in phrases]
        # Number of extractions per distinct relation type.
        n_extractions = array(list(Counter(relation_types).values()))

        # y[k] = number of relation types with at least x[k] extractions.
        y = [sum(n_extractions >= i) for i in x]
        log_y = log10(y)
        # log10(0) is -inf; plot empty bins at 0. Note that this makes a
        # count of 0 indistinguishable from a count of 1 on the figure.
        log_y[isinf(log_y)] = 0

        plot(x, log_y, linestyle, color=color, marker='.', linewidth=2.5, **extra_kwargs)

        _mean = mean(n_extractions)
        # scipy.stats.variation is the coefficient of variation (std / mean),
        # not the variance.
        _cv = variation(n_extractions)

        # Formatted with '%s' so the output is the same under the Python 2
        # print statement and the Python 3 print function.
        print('%s %s %s' % (method, _mean, _cv))

grid()

In [ ]:
# Axis labels, x-range, legend and tick sizes for the
# extractions-per-relation-type figure.
fontsize = 32
axis_label_size = fontsize + 2

xlim(1, 500)
xlabel('Min. no. of extractions', fontsize=axis_label_size)
ylabel('Log. count of relation types', fontsize=axis_label_size)

# Legend entries that were created by the labelled plot() calls.
handles, _labels = ax.get_legend_handles_labels()

# Two black proxy lines that explain the line styles (solid / dashed); they
# are only used as legend handles and are not added to the axes.
custom_artists = [plt.Line2D((0,1),(0,0), color='k', linestyle=line_style)
                  for line_style in ('-', '--')]

ax.legend(handles+custom_artists,
          _labels + ['All', 'Unique'],
          loc='upper right', prop={'size': fontsize})

for set_ticks in (xticks, yticks):
    set_ticks(fontsize=fontsize)

In [ ]:
# Write the current figure as both PDF and PNG with identical settings.
# NOTE(review): 'extrations' (sic) is kept as-is because it is part of the
# output file name.
fname = workspace+'/data/results/quantitative/'+keyword+'/extrations-per-reltype'

export_options = dict(dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
                      papertype=None, format=None, transparent=False,
                      bbox_inches='tight', pad_inches=0.1)
for extension in ('.pdf', '.png'):
    savefig(fname+extension, **export_options)

Subsumption relations

  • Unique instances per subsumption type

In [ ]:
# For every extraction system, plot (on a log10 scale) how many distinct
# classes -- the arg2 of relations whose verb-lemmatized phrase is 'is a' or
# 'be' -- occur in at least k such extractions. Solid lines count all
# extractions, dashed lines count each distinct triple only once.
fig = figure()
ax = fig.add_subplot(1,1,1)

# Lemmatized relation phrases that are treated as subsumption relations.
subsumption_lemmas = ('is a', 'be')

# Thresholds k on the x-axis. The first threshold is 1 rather than 0 for both
# line styles: every counted class has at least one extraction, so the count
# is the same, and the first point lands inside the x-range that is later set
# for this figure.
x = concatenate((arange(0, 100, 20), arange(100, 501, 50)))
x[0] = 1

for method, label, color in zip(methods, labels, colors):
    input_file = workspace+'/data/'+method+'/' + keyword + '/relations'+coref_suffix+'.pickle'
    # Open in binary mode and close the handle deterministically.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(input_file, 'rb') as pickle_file:
        relations = pickle.load(pickle_file)

    # (line style, extra plot kwargs, (relation phrase, arg2) pairs of every
    # extraction considered). Only the solid line carries a label, so each
    # system appears once in the legend.
    variants = [
        ('-', {'label': label}, [(r['rel'], r['arg2']) for r in relations]),
        ('--', {}, [(r[1], r[2]) for r in get_uniq_relations(relations)]),
    ]
    for linestyle, extra_kwargs, pairs in variants:
        # Lemmatize each phrase once and keep arg2 of the subsumption relations.
        classes = [arg2 for phrase, arg2 in pairs
                   if lemmatizer.lemmatize(phrase, pos='v') in subsumption_lemmas]
        # Number of extractions per distinct class.
        n_extractions = array(list(Counter(classes).values()))

        # y[k] = number of classes with at least x[k] extractions.
        y = [sum(n_extractions >= i) for i in x]
        log_y = log10(y)
        # log10(0) is -inf; plot empty bins at 0. Note that this makes a
        # count of 0 indistinguishable from a count of 1 on the figure.
        log_y[isinf(log_y)] = 0

        plot(x, log_y, linestyle, color=color, marker='.', linewidth=2.5, **extra_kwargs)

        _mean = mean(n_extractions)
        # scipy.stats.variation is the coefficient of variation (std / mean),
        # not the variance.
        _cv = variation(n_extractions)

        # Formatted with '%s' so the output is the same under the Python 2
        # print statement and the Python 3 print function.
        print('%s %s %s' % (method, _mean, _cv))
        print(y)

grid()

In [ ]:
# Axis labels, x-range, legend and tick sizes for the
# extractions-per-class figure.
fontsize = 32
axis_label_size = fontsize + 2

xlim(1, 100)
xlabel('Min. no. of extractions', fontsize=axis_label_size)
ylabel('Log. count of concepts', fontsize=axis_label_size)

# Legend entries that were created by the labelled plot() calls.
handles, _labels = ax.get_legend_handles_labels()

# Two black proxy lines that explain the line styles (solid / dashed); they
# are only used as legend handles and are not added to the axes.
custom_artists = [plt.Line2D((0,1),(0,0), color='k', linestyle=line_style)
                  for line_style in ('-', '--')]

ax.legend(handles+custom_artists,
          _labels + ['All', 'Unique'],
          loc='upper right', prop={'size': fontsize})

for set_ticks in (xticks, yticks):
    set_ticks(fontsize=fontsize)

In [ ]:
# Write the current figure as both PDF and PNG with identical settings.
# NOTE(review): 'extrations' (sic) is kept as-is because it is part of the
# output file name.
fname = workspace+'/data/results/quantitative/'+keyword+'/extrations-per-class'

export_options = dict(dpi=200, facecolor='w', edgecolor='w', orientation='landscape',
                      papertype=None, format=None, transparent=False,
                      bbox_inches='tight', pad_inches=0.1)
for extension in ('.pdf', '.png'):
    savefig(fname+extension, **export_options)

In [ ]: